In this study I am going to forecast the number of unemployed people in Spain for the following 12 months.

Loading the basic libraries that I am going to use

# Attach the packages used in this analysis.
# Duplicate library() calls removed: forecast, TSstudio, plotly and tidyverse
# were each loaded twice in the original script.
library(forecast)   # tslm, holt, arima, auto.arima, ggseasonplot, checkresiduals
library(TSstudio)   # ts_plot, ts_split, ts_seasonal, ts_lags, test_forecast
library(plotly)     # interactive plotting backend used by TSstudio
library(tidyverse)  # dplyr, ggplot2, ... (explicit loads below are redundant but kept)
library(stats)      # attached by default in R; kept from the original script
library(dplyr)
library(ggplot2)
library(dygraphs)
library(lubridate)
library(datasets)   # attached by default in R; kept from the original script
library(base)       # attached by default in R; kept from the original script
library(h2o)

Loading the data into R

# Read the raw unemployment data; the file uses ";" as field separator and
# "." as decimal mark (Spanish CSV export).
# NOTE(review): the original setwd() call was removed — changing the working
# directory in a script is an anti-pattern, and it was redundant here because
# read.csv() already receives an absolute path.
paroesp <- read.csv("C:/Users/marct/OneDrive - Tecnocampus Mataro-Maresme/Documentos/CURSOS/PROJECTES/TIME SERIES ANALYSIS/paroesp/paroesp.csv", sep = ";", dec = ".")

Data preparation

# Keep the first 230 rows and the first 3 columns (year, month name, value).
paroesp <- paroesp[1:230, 1:3]
colnames(paroesp) <- c("Year", "Month", "y")

# The raw values use "." as thousands separator and "," as decimal mark:
# strip the thousands separators, then turn the decimal comma into a point.
paroesp$y <- as.numeric(gsub(",", ".", gsub("\\.", "", paroesp$y)))

# Map Spanish month names to month numbers with a single match() lookup
# instead of twelve nested ifelse() calls. Unmatched names become NA
# (the original fell back to the string "Nulo", which could silently coerce
# the whole column to character).
spanish_months <- c("Enero", "Febrero", "Marzo", "Abril", "Mayo", "Junio",
                    "Julio", "Agosto", "Septiembre", "Octubre", "Noviembre",
                    "Diciembre")
paroesp$month_number <- match(paroesp$Month, spanish_months)

# Build the date column used by the rest of the script (the prose below and
# the printed head() show it, but the chunk creating it was missing). Each
# observation is assigned to the first day of its month.
paroesp$date <- as.Date(sprintf("%d-%02d-01", paroesp$Year, paroesp$month_number))

Now we create the date object from the year, month and day. We suppose that every record is recorded on the first day of each month, in order to create the time series object.

##         date       y
## 1 2001-01-01 2017389
## 2 2001-02-01 1993273
## 3 2001-03-01 1981006
## 4 2001-04-01 1910453
## 5 2001-05-01 1898285
## 6 2001-06-01 1842556

Creating the time series

paroesp_ts <- ts(paroesp[,2], start = c(lubridate::year(min(paroesp$date)), lubridate::month(min(paroesp$date))), frequency = 12)

Exploratory data analysis

# Decompose the series (trend / seasonal / remainder) and plot the raw series.
ts_decompose(paroesp_ts)
ts_plot(paroesp_ts,
        title = "Unemployed people in Spain, 2000-2020",
        Xtitle = "Year",
        Ytitle = "Number of unemployed")  # fixed mojibake: was "N?mero of unemployed"

We can see that the series has two very different cycles (before 2013 and after). Since we want to forecast the following year, we will only use data after 2013, because we don’t want to introduce noise into the model. Also, the series has an additive structure.

# Keep only observations from January 2013 onwards: the pre-2013 regime
# follows a very different cycle and would add noise to a 12-month forecast.
paroesp_ts_2013 <- window(paroesp_ts, start = c(2013,1))
ts_info(paroesp_ts_2013)
##  The paroesp_ts_2013 series is a ts object with 1 variable and 86 observations
##  Frequency: 12 
##  Start time: 2013 1 
##  End time: 2020 2
# Plot the restricted series.
ts_plot(paroesp_ts_2013, 
        title = "Unemployed people in Spain",
        Xtitle = "Year",
        Ytitle = "Number of unemployed people")

Seasonality analysis

# Seasonal plots: one line per year, a polar view, and TSstudio's seasonal
# views (by month, by year-cycle, and monthly boxplots).
# T replaced with TRUE: T is an ordinary variable that can be reassigned.
ggseasonplot(paroesp_ts_2013, year.labels = TRUE, continuous = TRUE)

ggseasonplot(paroesp_ts_2013, polar = TRUE)

ts_seasonal(paroesp_ts_2013, type = "normal")
ts_seasonal(paroesp_ts_2013, type = "cycle")
ts_seasonal(paroesp_ts_2013, type = "box")

The series has a clear seasonal component (during summer the number of unemployed people reduces due to tourism)

Correlation analysis

# ACF/PACF side by side. Save and restore the graphics state so the 1x2
# layout does not leak into later plots.
op <- par(mfrow = c(1, 2))
acf(paroesp_ts_2013)
pacf(paroesp_ts_2013)
par(op)

Since there is a linear trend in the first acf plot, this is an indicator that the series is non-stationary and in order to see the true relationship between the series and its lags, we detrend it.

# ACF/PACF of the first-differenced (detrended) series out to 5 seasonal
# cycles (lag.max = 60). Graphics state restored afterwards.
op <- par(mfrow = c(1, 2))
acf(diff(paroesp_ts_2013, 1), lag.max = 60)
pacf(diff(paroesp_ts_2013, 1), lag.max = 60)
par(op)

# Lag scatter plots: first non-seasonal lag plus three seasonal lags.
ts_lags(paroesp_ts_2013, lags = c(1, 12, 24, 36))

As we can see there is a strong relationship between the series and its first non-seasonal lag, and its first and second seasonal lags.

We split the series into training and test partitions.

# Hold out the final 12 months of the series as the test partition.
paro_esp_13_split <- ts_split(paroesp_ts_2013, sample.out = 12)
train <- paro_esp_13_split[["train"]]
test <- paro_esp_13_split[["test"]]

First approach: Linear Regression (Season and Trend)

# Linear model with monthly seasonal dummies and a linear trend term.
md_lm <- tslm(train ~ season + trend)
summary(md_lm)
## 
## Call:
## tslm(formula = train ~ season + trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -127389  -38069    5981   34427  109877 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5135004.0    23519.8 218.327  < 2e-16 ***
## season2       31670.4    29308.8   1.081   0.2841    
## season3        1073.1    30527.7   0.035   0.9721    
## season4      -68106.5    30517.3  -2.232   0.0293 *  
## season5     -148532.6    30509.9  -4.868 8.31e-06 ***
## season6     -231244.2    30505.5  -7.580 2.32e-10 ***
## season7     -255532.9    30504.0  -8.377 9.90e-12 ***
## season8     -205762.3    30505.5  -6.745 6.34e-09 ***
## season9     -155178.6    30509.9  -5.086 3.74e-06 ***
## season10     -61302.6    30517.3  -2.009   0.0490 *  
## season11     -36794.4    30527.7  -1.205   0.2328    
## season12     -81071.1    30541.0  -2.655   0.0101 *  
## trend        -26837.3      300.6 -89.272  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54830 on 61 degrees of freedom
## Multiple R-squared:  0.9927, Adjusted R-squared:  0.9913 
## F-statistic: 690.8 on 12 and 61 DF,  p-value: < 2.2e-16
# 12-month forecast from the linear model, scored on the test partition.
fc_lm <- forecast(md_lm, h = 12)
accuracy(fc_lm, test)
##                         ME      RMSE       MAE         MPE     MAPE
## Training set -2.516685e-11  49780.41  40212.91 -0.01035867 1.057270
## Test set      2.688037e+05 283573.35 268803.75  8.53400059 8.534001
##                   MASE      ACF1 Theil's U
## Training set 0.1300801 0.8384918        NA
## Test set     0.8695225 0.7802304  4.746687
# Plot actuals vs fitted/forecast values, then run residual diagnostics
# (Breusch-Godfrey test + residual ACF; see output below).
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_lm,
              test = test)
checkresiduals(md_lm) 

## 
##  Breusch-Godfrey test for serial correlation of order up to 16
## 
## data:  Residuals from Linear regression model
## LM test = 60.888, df = 16, p-value = 3.706e-07

MAPE in the test partition is about 8 times higher than in the training set, so the model may be overfitting. In addition the residuals aren’t white noise (they’re correlated), and therefore the model couldn’t capture all the patterns in the data. Finally the residuals aren’t normally distributed. Due to all these reasons we won’t consider this model to forecast our data.

Second approach: Linear Regression (Season, trend, summerseason, springbreak and christmas)

We create a data frame to store the variables we are going to regress against with. These variables are springbreak (a binary variable with value = 1 if the month is April), summerseason (a variable with value 1 if the months are May, June, July, August or September), and christmas (following the same criteria, but with December and January). I have tried to use these variables, because these are special seasons where usually unemployment is decreased.

# Data frame of post-2013 observations plus dummy regressors for periods
# where unemployment usually decreases.
paroesp_13_df <- filter(paroesp, lubridate::year(date) >= 2013)

# summerseason: May-September (tourism season). A single %in% test replaces
# the five nested ifelse() calls of the original; as.numeric() keeps the
# same 1/0 numeric coding.
paroesp_13_df$summerseason <- as.numeric(lubridate::month(paroesp_13_df$date) %in% 5:9)
# springbreak: April only.
paroesp_13_df$springbreak <- as.numeric(lubridate::month(paroesp_13_df$date) == 4)
# christmas: December and January.
paroesp_13_df$christmas <- as.numeric(lubridate::month(paroesp_13_df$date) %in% c(12, 1))

# Same 12-month holdout as the ts split above.
train_df <- paroesp_13_df[1:(nrow(paroesp_13_df) - 12), ]
test_df <- paroesp_13_df[(nrow(paroesp_13_df) - 12 + 1):nrow(paroesp_13_df), ]

# Regress on trend + monthly season dummies + the three custom dummies.
# NOTE(review): summerseason, springbreak and christmas are exact linear
# combinations of the monthly season dummies, so the fit is rank-deficient
# and their coefficients come back NA (visible in the summary output below).
md_lm2 <- tslm(train ~ trend + season + summerseason + springbreak + christmas, data = train_df)
summary(md_lm2)
## 
## Call:
## tslm(formula = train ~ trend + season + summerseason + springbreak + 
##     christmas, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -127389  -38069    5981   34427  109877 
## 
## Coefficients: (3 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5135004.0    23519.8 218.327  < 2e-16 ***
## trend         -26837.3      300.6 -89.272  < 2e-16 ***
## season2        31670.4    29308.8   1.081   0.2841    
## season3         1073.1    30527.7   0.035   0.9721    
## season4       -68106.5    30517.3  -2.232   0.0293 *  
## season5      -148532.6    30509.9  -4.868 8.31e-06 ***
## season6      -231244.2    30505.5  -7.580 2.32e-10 ***
## season7      -255532.9    30504.0  -8.377 9.90e-12 ***
## season8      -205762.3    30505.5  -6.745 6.34e-09 ***
## season9      -155178.6    30509.9  -5.086 3.74e-06 ***
## season10      -61302.6    30517.3  -2.009   0.0490 *  
## season11      -36794.4    30527.7  -1.205   0.2328    
## season12      -81071.1    30541.0  -2.655   0.0101 *  
## summerseason        NA         NA      NA       NA    
## springbreak         NA         NA      NA       NA    
## christmas           NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54830 on 61 degrees of freedom
## Multiple R-squared:  0.9927, Adjusted R-squared:  0.9913 
## F-statistic: 690.8 on 12 and 61 DF,  p-value: < 2.2e-16
fc_lm2 <- forecast(md_lm2, h = 12, newdata = test_df)
## Warning in predict.lm(predict_object, newdata = newdata, se.fit = TRUE, :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(predict_object, newdata = newdata, se.fit = TRUE, :
## prediction from a rank-deficient fit may be misleading
accuracy(fc_lm2, test)
##                        ME      RMSE       MAE         MPE     MAPE
## Training set 5.034272e-11  49780.41  40212.91 -0.01035867 1.057270
## Test set     2.688037e+05 283573.35 268803.75  8.53400059 8.534001
##                   MASE      ACF1 Theil's U
## Training set 0.1300801 0.8384918        NA
## Test set     0.8695225 0.7802304  4.746687
# Visual check and residual diagnostics for the augmented regression.
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_lm2,
              test = test)
checkresiduals(md_lm2) 

## 
##  Breusch-Godfrey test for serial correlation of order up to 19
## 
## data:  Residuals from Linear regression model
## LM test = 61.081, df = 19, p-value = 2.607e-06

The same problem that happened with our first regression model occurs with this one, and also the new variables aren’t statistically significant.

Forecasting with Holt’s Method

# Re-create the same train/test split (identical to the earlier one; kept so
# this section is self-contained).
paro_esp_13_split <- ts_split(paroesp_ts_2013, sample.out = 12)
train <- paro_esp_13_split$train
test <- paro_esp_13_split$test 

# Holt's linear (double) exponential smoothing; smoothing parameters and
# initial states estimated by optimization ("optimal" initialisation).
fc_holt <- holt(train, h = 12, initial = "optimal")
fc_holt$model
## Holt's method 
## 
## Call:
##  holt(y = train, h = 12, initial = "optimal") 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 5077214.9216 
##     b = -25938.1357 
## 
##   sigma:  66819.92
## 
##      AIC     AICc      BIC 
## 1968.633 1969.515 1980.153
accuracy(fc_holt, test) 
##                     ME      RMSE       MAE        MPE     MAPE      MASE
## Training set  1781.273  64988.89  56234.01 0.05580604 1.414807 0.1819050
## Test set     21985.364 147390.51 126416.34 0.59537806 4.012773 0.4089298
##                   ACF1 Theil's U
## Training set 0.3733410        NA
## Test set     0.7832341  2.471319
checkresiduals(fc_holt)

## 
##  Ljung-Box test
## 
## data:  Residuals from Holt's method
## Q* = 119.61, df = 11, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 15

Now we try to predict the values in the test partition with this model

# Overlay the Holt forecast on the actual series over the test window.
test_forecast(forecast.obj = fc_holt,
              actual = paroesp_ts_2013,
              test = test)

The conclusion is similar to the linear regressions, the model is overfitting, the residuals aren’t white noise and they’re correlated with their lags.

Forecasting with ARIMA (and SARIMA) models

We first plot the autocorrelation function and the partial autocorrelation function plots.

# ACF/PACF of the full post-2013 series out to 60 lags; save and restore the
# graphics layout so it does not leak into later plots.
op <- par(mfrow = c(1, 2))
acf(paroesp_ts_2013, lag.max = 60)
pacf(paroesp_ts_2013, lag.max = 60)
par(op)

As we can see in the acf, the correlation of the series and its lags is slowly decaying in a linear manner. Therefore the series is not stationary. Due to this, we aren’t able to extract a lot of information of the real relationship between the series and its lags. That’s why we differentiate the series with its first non-seasonal lag (in order to detrend it)

# First non-seasonal difference of the training series (removes the trend).
paroesp13_d1 <- diff(train, 1)

# ACF/PACF of the differenced series; graphics state restored afterwards.
op <- par(mfrow = c(1, 2))
acf(paroesp13_d1, lag.max = 60)
pacf(paroesp13_d1, lag.max = 60)
par(op)

ts_plot(paroesp13_d1,
        title = "Unemployed people in Spain - First Difference",
        Xtitle = "Year",
        Ytitle = "Number of Unemployed people")

We can see there’s still a bit of variance; to try to stabilize it, I test two different approaches

1. Taking the second order non-seasonal difference

# Second-order non-seasonal difference (difference of the first difference).
paroesp13_d2 <- diff(paroesp13_d1, 1)

op <- par(mfrow = c(1, 2))
acf(paroesp13_d2, lag.max = 60)
pacf(paroesp13_d2, lag.max = 60)
par(op)

ts_plot(paroesp13_d2,
        title = "Unemployed people in Spain - Second non-seasonal difference",
        Xtitle = "Year",
        Ytitle = "Number of Unemployed people")

As we can see in the last plot this process doesn’t decrease the variance, it increases it (so we discard it)

2. Taking the first non-seasonal difference and the first seasonal difference

# First seasonal difference (lag 12) on top of the first non-seasonal one.
paroesp13_d1_d12 <- diff(paroesp13_d1, 12)

op <- par(mfrow = c(1, 2))
acf(paroesp13_d1_d12, lag.max = 60)
pacf(paroesp13_d1_d12, lag.max = 60)
par(op)

ts_plot(paroesp13_d1_d12)

It also increases the variance, so we discard it

Tuning the parameters

Since the series has a lot of correlation with its seasonal lags, we will try to fit a SARIMA model. We create a function that fits ARIMA models for different values of p, d, q, P, D and Q and gives the AIC (error metric) for each one. The lower the AIC of a model, the better it is expected to perform. We select the top three models with the lowest AIC.

# Candidate (p,d,q)(P,D,Q) combinations for the grid search: d fixed at 1,
# D at 0, the remaining orders range over 0-2, keeping only models whose
# total order k = p+d+q+P+D+Q is at most 3 (small, fast-to-fit models).
p <- q <- P <- Q <- 0:2
d <- 1
D <- 0
arima_grid <- expand.grid(p,d,q,P,D,Q)
names(arima_grid) <- c("p", "d", "q", "P", "D", "Q")
arima_grid$k <- rowSums(arima_grid)
arima_grid <- filter(arima_grid, k <= 3)

# Fit every candidate SARIMA on the training set and collect its AIC, sorted
# ascending (lower AIC = better in-sample fit penalised for complexity).
# Changes vs the original: seq_len() instead of 1:nrow(); the dead
# `md <- NULL` initialisation removed; the data frame is returned directly
# rather than via a superfluous `results <-` assignment; and each fit is
# wrapped in tryCatch() so a non-converging model is skipped (NULL rows are
# dropped by bind_rows()) instead of aborting the whole search.
arima_search <- lapply(seq_len(nrow(arima_grid)), function(i) {
        tryCatch({
                md <- arima(train,
                            order = c(arima_grid$p[i], arima_grid$d[i], arima_grid$q[i]),
                            seasonal = list(order = c(arima_grid$P[i], arima_grid$D[i], arima_grid$Q[i])),
                            method = "ML")
                data.frame(p = arima_grid$p[i], d = arima_grid$d[i], q = arima_grid$q[i],
                           P = arima_grid$P[i], D = arima_grid$D[i], Q = arima_grid$Q[i],
                           AIC = md$aic)
        }, error = function(e) NULL)
}) %>% bind_rows() %>% arrange(AIC)
arima_search[1:3,]
##   p d q P D Q      AIC
## 1 0 1 0 1 0 1 1710.117
## 2 0 1 0 2 0 0 1711.764
## 3 0 1 0 1 0 0 1711.780

Our function tells us that the three models with the lowest AIC are, in ascending order: SARIMA(0,1,0)(1,0,1), SARIMA(0,1,0)(2,0,0) and SARIMA(0,1,0)(1,0,0). We will compare the performance of all three and select the best one.

# Candidate 1: SARIMA(0,1,0)(1,0,1)[12] — lowest AIC in the grid search.
arima_md1 <- arima(train, order = c(0,1,0), seasonal = list(order = c(1,0,1)))
fc_arima_md1 <- forecast(arima_md1, h = 12)
accuracy(fc_arima_md1, test)
##                     ME      RMSE      MAE         MPE      MAPE       MASE
## Training set -2778.927  23363.15 18076.83 -0.04248864 0.4453621 0.05847466
## Test set     99763.519 117945.32 99763.52  3.15400698 3.1540070 0.32271361
##                  ACF1 Theil's U
## Training set 0.111312        NA
## Test set     0.792704  1.979946
checkresiduals(arima_md1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(1,0,1)[12]
## Q* = 27.838, df = 13, p-value = 0.009533
## 
## Model df: 2.   Total lags used: 15
# Candidate 2: SARIMA(0,1,0)(2,0,0)[12].
arima_md2 <- arima(train, order = c(0,1,0), seasonal = list(order = c(2,0,0)))
fc_arima_md2 <- forecast(arima_md2, h = 12)
accuracy(fc_arima_md2, test)
##                     ME     RMSE      MAE         MPE      MAPE       MASE
## Training set -2348.268 23959.96 18481.16 -0.03262915 0.4552796 0.05978259
## Test set     77603.814 94571.44 77603.81  2.44832377 2.4483238 0.25103171
##                    ACF1 Theil's U
## Training set 0.06062383        NA
## Test set     0.80251962  1.583988
checkresiduals(arima_md2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(2,0,0)[12]
## Q* = 28.414, df = 13, p-value = 0.007918
## 
## Model df: 2.   Total lags used: 15
# Candidate 3: SARIMA(0,1,0)(1,0,0)[12].
arima_md3 <- arima(train, order = c(0,1,0), seasonal = list(order = c(1,0,0)))
fc_arima_md3 <- forecast(arima_md3, h = 12)
accuracy(fc_arima_md3, test)
##                     ME     RMSE      MAE         MPE      MAPE       MASE
## Training set -2342.975 24468.01 18414.04 -0.03549347 0.4542041 0.05956549
## Test set     60475.072 79395.09 61571.94  1.90080640 1.9363869 0.19917205
##                    ACF1 Theil's U
## Training set 0.02179282        NA
## Test set     0.82266318  1.326658
checkresiduals(fc_arima_md3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(1,0,0)[12]
## Q* = 32.148, df = 14, p-value = 0.003816
## 
## Model df: 1.   Total lags used: 15

The third model SARIMA(0,1,0)(1,0,0) is the best in terms of MAPE, and it’s not overfitting (SARIMA(0,1,0)(2,0,0) would be in 2nd place and SARIMA(0,1,0)(1,0,1) is overfitting, because the MAPE of the testing partition is much higher than in the training partition). If we check the residuals we can see that the three models perform pretty well: the residual series seems to be close to white noise, and in all cases the residuals seem to have a correlation with lag 11 (maybe because of a non-typical value or because the model wasn’t able to capture all the pattern of the series). The distribution of the three models’ residuals seems to be fairly normal. All in all, the model that performs best in terms of MAPE and has the best proportion between the MAPE of the training and testing partitions, without overfitting, is model 3 (SARIMA(0,1,0)(1,0,0))

# Visual comparison of the three candidate forecasts against the actual
# series (all arguments passed by name for clarity).
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_arima_md1,
              test = test)
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_arima_md2,
              test = test)
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_arima_md3,
              test = test)

Here graphically we can confirm our suspicions, and the third model is the best of all three. We compare this model to the output of the auto.arima() function, which determines optimal values (not always the best) to fit an ARIMA model to a time series.

# Let auto.arima() choose orders automatically, for comparison with the
# manually selected candidates.
auto <- auto.arima(train) 
## Warning in auto.arima(train): Having 3 or more differencing operations is
## not recommended. Please consider reducing the total number of differences.
fc_auto <- forecast(auto, h = 12)
accuracy(fc_auto, test)
##                     ME     RMSE      MAE       MPE      MAPE       MASE
## Training set  5110.271 21190.24 15544.55 0.1319751 0.3974555 0.05028329
## Test set     58344.164 69345.43 58344.16 1.8469402 1.8469402 0.18873087
##                    ACF1 Theil's U
## Training set 0.01097928        NA
## Test set     0.77567459  1.167235
checkresiduals(auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,2,1)(0,1,1)[12]
## Q* = 14.358, df = 13, p-value = 0.3491
## 
## Model df: 2.   Total lags used: 15
# Overlay the auto.arima forecast on the actual series.
test_forecast(actual = paroesp_ts_2013,
              forecast.obj = fc_auto,
              test = test)

The auto.arima() function recommends us to fit an ARIMA(0,2,1)(0,1,1) with an AIC score of 1393, seems to not be overfitting. If we observe the residuals, we can see they are not normally distributed, the lags seem to not be correlated, and the Ljung Box Test indicates us that they are independent.

Deciding the final model

I have tried various models and the only one that seems to fit the series fairly well is the SARIMA(0,1,0)(1,0,0). Even though the auto.arima model performs better, I don’t think it represents the true behaviour of the series (because earlier I analyzed the effects of taking a second difference, and it added more variance to the series). Also, this auto-generated model has 3 orders of differencing, when it is recommended not to exceed 2 orders of differencing. That’s why I prefer to use my SARIMA(0,1,0)(1,0,0) model to forecast the final result. Even though in the residuals analysis the SARIMA(0,1,0)(1,0,0) seems to have some degree of correlation with the 11th lag, I think that relationship may be caused by chance. On the other hand, although the residuals of the SARIMA(0,2,1)(0,1,1) and their lags seem not to be correlated, they are much more variable and they definitely aren’t normally distributed. For all these reasons the model I choose as the best — or the least bad — is the SARIMA(0,1,0)(1,0,0).

Final forecast

# Refit the chosen SARIMA(0,1,0)(1,0,0)[12] on the full post-2013 series and
# produce the final 12-month forecast.
paroesp13_best_md <- arima(paroesp_ts_2013, order = c(0, 1, 0), seasonal = list(order = c(1, 0, 0)))
fc_test_best <- forecast(paroesp13_best_md, h = 12)
plot_forecast(fc_test_best,
              title = "Forecast of Unemployed people in Spain using SARIMA(0,1,0)(1,0,0)",
              Xtitle = "Year",
              Ytitle = "Number of Unemployed people")
# Zoomed ggplot view of the forecast with prediction intervals.
# library(ggplot2) removed here — it is already attached at the top of the
# script.
library(ggfortify)
startTime <- as.Date("2018-01-01")
endTime <- as.Date("2021-01-04")
start.end <- c(startTime, endTime)
# Zoom with coord_cartesian() instead of scale_x_date(limits = ...) + ylim():
# setting limits on scales drops out-of-range rows (the "Removed ... rows"
# warnings below) and adding ylim() on top of an existing y scale triggers
# the "Scale for 'y' is already present" message; coord_cartesian() only
# changes the visible window without discarding data.
autoplot(fc_test_best) +
        ggtitle("Number of Unemployed people in Spain - Forecast using SARIMA(0,1,0)(1,0,0)") +
        xlab("Year") +
        ylab("Number of Unemployed people") +
        theme_replace() +
        coord_cartesian(xlim = start.end, ylim = c(2750000, 3600000))
## (Output produced by the original scale_x_date()/ylim() version:)
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Warning: Removed 60 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).